/* Export the non-imputed data for the FHS industries to .csv format for CART imputation. */

/** Modified 03-18-2015 to use the files in which ALL imputes (not just PQS imputes) have been replaced with missing values.
    Also changed the output directory to the subfolder /rev4 (4th revision for REStat).
**/

%include "ASMimplibs.sas";


/*
   select_and_export

   For one product:
     1. reads db50im.all&prod and derives cenergy plus four ratio variables;
     2. sorts by year and ppn and prints per-year N / NMISS / min / mean / max;
     3. saves an id crosswalk (with a row counter) to fhs7797.&prod._gooddata_ids;
     4. reduces WORK.&prod._gooddata to the variables handed to CART and
        writes them to "&prod._gooddata.csv".

   Positional parameters:
     prod   - product name; used to build every data set and file name.
     ind    - accepted, but never referenced in the macro body.
     lastyr - accepted, but never referenced in the macro body.
*/
%macro select_and_export(prod,ind,lastyr);

  %local summary_vars ratio_vars;

  /* Variables summarised by PROC MEANS, in the order they are printed. */
  %let summary_vars = tvs pqs pv plant_cm cp cf ee cenergy ksst kseq sw te ph ww pw;

  /* Ratio variables derived in the first DATA step. */
  %let ratio_vars = energycmratio cpcmratio wwswratio pvtvsratio;

  * prod = product name;

  /* How the ratio variables are used after imputation:
       - numerator observed, denominator missing: the observed value is used
         when the regression variables are constructed;
       - numerator missing: the imputed ratio is multiplied by the denominator
         to compute the numerator (whether or not the denominator is imputed).
     Imputed numerators are therefore never greater than their denominators,
     because that is never observed in the edited data and CART draws the
     ratios from the observed data. */

  data &prod._gooddata
      (keep = ppn id year &summary_vars &ratio_vars fdeath pqs_imp_mean);
    set db50im.all&prod;

    /* SUM() adds the nonmissing arguments only. */
    cenergy       = sum(cf, ee);
    energycmratio = cenergy / plant_cm;
    cpcmratio     = cp / plant_cm;
    wwswratio     = ww / sw;
    pvtvsratio    = pv / tvs;
  run;

  proc sort data = &prod._gooddata;
    by year ppn;
  run;

  proc means data = &prod._gooddata n nmiss min mean max;
    by year;
    var &summary_vars;
    title "Non-imputed data for &prod";
  run;

  /* Save the ppn values for re-merging after the CART imputes.
     number is the observation index in the sorted data, which is also the
     row order of the CSV written below. */
  data fhs7797.&prod._gooddata_ids (keep = number ppn id pqs_imp_mean);
    set &prod._gooddata;
    number = _N_;
  run;

  /* Overwrite the WORK copy with only the variables that go to CART. */
  data &prod._gooddata
      (keep = year tvs pqs plant_cm &ratio_vars ksst kseq sw te ph fdeath);
    set &prod._gooddata;
  run;

  proc export data = &prod._gooddata
    outfile = "&prod._gooddata.csv"
    dbms = csv replace;
    putnames = yes;
  run;

%mend select_and_export;



/******************************/
/* single-product industries: */
/******************************/

%select_and_export(carbonf,2895,1997);
%select_and_export(flrf,2426,1997);
%select_and_export(gasf,2911,1997);
%select_and_export(icebf,2097,1997);
%select_and_export(icepf,2097,1987);

/* Sugar (sugf) is deliberately skipped: no CART imputations are done for sugar.
   The call is disabled with a block comment because a macro call placed inside
   a statement-style comment is still expanded by the macro processor, so the
   generated steps would run anyway.
   %select_and_export(sugf,2061,1997);
*/

/******************************/
/* Other industries that I'm treating as single-product industries: */
/******************************/

/** Bread has two products in 1992 and 1997, but only a few observations for one of the products.
    In 1977-1987 bread has only one product. **/

/* ind and lastyr are left out where they were not supplied; select_and_export
   never references either parameter, so only prod affects the result. */
%select_and_export(bredf);
%select_and_export(cofff);
%select_and_export(plyf, 2435, 1997);
%select_and_export(boxf);


/** For concrete, create separate files for each year: **/


/* Build the concrete (concf) working data set: derive cenergy and the four
   ratio variables, then keep the ids, raw inputs and ratios. */
data concf_gooddata
    (keep = ppn id year
            tvs pqs pv plant_cm cp cf ee cenergy
            energycmratio cpcmratio wwswratio pvtvsratio
            ksst kseq sw te ph ww pw
            fdeath pqs_imp_mean);
  set db50im.allconcf;

  /* SUM() adds the nonmissing arguments only. */
  cenergy       = sum(cf, ee);
  energycmratio = cenergy / plant_cm;
  cpcmratio     = cp / plant_cm;
  wwswratio     = ww / sw;
  pvtvsratio    = pv / tvs;
run;

/* Order by year then plant so the by-year steps that follow see grouped data. */
proc sort data = concf_gooddata;
  by year ppn;
run;

/* Per-year counts, missing counts and ranges for the raw variables. */
proc means data = concf_gooddata n nmiss min mean max;
  by year;
  var tvs pqs pv plant_cm cp cf ee cenergy ksst kseq sw te ph ww pw;
  title "Non-imputed data for concf (concrete)";
run;

 /* Save the ppn values, one data set per year, for re-merging after the
    CART imputes. */

 /* Step 1: split the id columns by year.  Observations from any year other
    than 1977, 1982, 1987 or 1992 are not written to any data set. */
 data concf77_gooddata_ids
      concf82_gooddata_ids
      concf87_gooddata_ids
      concf92_gooddata_ids;
   set concf_gooddata (keep = ppn year id pqs_imp_mean);
   select (year);
     when (1977) output concf77_gooddata_ids;
     when (1982) output concf82_gooddata_ids;
     when (1987) output concf87_gooddata_ids;
     when (1992) output concf92_gooddata_ids;
     otherwise;
   end;
 run;

 /* Step 2: for each two-digit year in yrs, copy the WORK id data set to
    fhs7797 and add number, the observation index within that year. */
 %macro number_concf_ids(yrs);
   %local k yy;
   %do k = 1 %to %sysfunc(countw(&yrs, %str( )));
     %let yy = %scan(&yrs, &k, %str( ));

     data fhs7797.concf&yy._gooddata_ids (keep = number ppn year id pqs_imp_mean);
       set concf&yy._gooddata_ids;
       number = _N_;
     run;
   %end;
 %mend number_concf_ids;

 %number_concf_ids(77 82 87 92);
                               
 /* Split the concrete data into one CART input data set per year, keeping
    only the variables passed to CART (year itself is not kept).
    Observations from any year other than 1977, 1982, 1987 or 1992 are not
    written to any data set. */
 data
   concf77_gooddata (keep = tvs pqs plant_cm energycmratio cpcmratio wwswratio pvtvsratio ksst kseq sw te ph fdeath)
   concf82_gooddata (keep = tvs pqs plant_cm energycmratio cpcmratio wwswratio pvtvsratio ksst kseq sw te ph fdeath)
   concf87_gooddata (keep = tvs pqs plant_cm energycmratio cpcmratio wwswratio pvtvsratio ksst kseq sw te ph fdeath)
   concf92_gooddata (keep = tvs pqs plant_cm energycmratio cpcmratio wwswratio pvtvsratio ksst kseq sw te ph fdeath)
   ;
   set concf_gooddata;
   select (year);
     when (1977) output concf77_gooddata;
     when (1982) output concf82_gooddata;
     when (1987) output concf87_gooddata;
     when (1992) output concf92_gooddata;
     otherwise;
   end;
 run;

 /* Write each per-year concrete data set to its own CSV file
    (concf77_gooddata.csv, concf82_gooddata.csv, ...), with a header row of
    variable names and overwriting any existing file.
    yrs is a blank-separated list of two-digit years. */
 %macro export_concf_csv(yrs);
   %local k yy;
   %do k = 1 %to %sysfunc(countw(&yrs, %str( )));
     %let yy = %scan(&yrs, &k, %str( ));

     proc export data = concf&yy._gooddata
       outfile = "concf&yy._gooddata.csv"
       dbms = csv replace;
       putnames = yes;
     run;
   %end;
 %mend export_concf_csv;

 %export_concf_csv(77 82 87 92);



